In [20]:
## This code cell will not be shown in the HTML version of this notebook
# imports from custom library
import sys
sys.path.append('../../')
from mlrefined_libraries import superlearn_library as superlearn
from mlrefined_libraries import math_optimization_library as optlib
from mlrefined_libraries import nonlinear_superlearn_library as nonlib
import matplotlib.pyplot as plt
from autograd.misc.flatten import flatten_func

# demos for this notebook
regress_plotter = superlearn.lin_regression_demos
optimizers = optlib.optimizers
normalizers = superlearn.normalizers 
static_plotter = optlib.static_plotter.Visualizer()
linear_datapath = '../../mlrefined_datasets/superlearn_datasets/'
nonlinear_datapath = '../../mlrefined_datasets/nonlinear_superlearn_datasets/'

# import autograd functionality to build functions properly for optimizers
import autograd.numpy as np

# import timer
from datetime import datetime 
import copy

# this is needed to compensate for %matplotlib notebook's tendency to blow up images when plotted inline
%matplotlib notebook
from matplotlib import rcParams
rcParams['figure.autolayout'] = True

%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Linear

In [2]:
# load data
csvname = linear_datapath + 'unnorm_linregress_data.csv'
data = np.loadtxt(csvname,delimiter = ',')
# inputs are all columns but the last, transposed so each column is a data point
x = data[:,:-1].T
# labels are the final column, kept as a column vector
y = data[:,-1:] 

# plot dataset
plt.figure(figsize = (5,3))
plt.scatter(x,y,color = 'k',edgecolor = 'w')
plt.show()

Model and cost

In [3]:
# evaluate the linear model at the input points in x using weights w
def model(x,w):
    # augment the input with a row of ones so w's first entry acts as the bias
    bias_row = np.ones((1,np.shape(x)[1]))
    x_aug = np.vstack((bias_row,x))

    # return the linear combination x_aug^T w (one prediction per data point)
    return np.dot(x_aug.T,w)

# least squares cost for linear regression; relies on the globals x and y
def least_squares(w):
    residuals = model(x,w) - y
    return np.sum(residuals**2)/float(len(y))

Optimizer

In [4]:
# run gradient descent to minimize the Least Squares cost for linear regression
# fix: the original had '0. 1*np.random.randn(...)', a syntax error; the intent
# is a small random initialization scaled by 0.1
g = least_squares; w = 0.1*np.random.randn(x.shape[0]+1,1); max_its = 100; alpha_choice = 10**(-1);
weight_history,cost_history = optimizers.gradient_descent(g,alpha_choice,max_its,w)
In [5]:
# plot the cost function history for a given run
# (static_plotter is the visualizer instance created in the setup cell above)
static_plotter.plot_cost_histories([cost_history],start = 0,points = False,labels = ['run 1'])
In [171]:
def plot_model(x,y,w,model,**kwargs):
    """Scatter the data (x,y) and overlay the model's fit over a fine grid.

    Optional kwarg 'transformer' is applied to the grid before evaluating the
    model (e.g. a normalizer, so a model trained on normalized input can be
    drawn against the original, unnormalized data).
    """
    # scatter plot the input data
    plt.figure(figsize = (5,3))
    plt.scatter(x,y,color = 'k',edgecolor = 'w')

    # build an evaluation grid spanning the data range, padded 10% on each side
    lo = np.min(x); hi = np.max(x)
    pad = (hi - lo)*0.1
    grid = np.linspace(lo - pad,hi + pad,100)[np.newaxis,:]

    # apply the same input transformation used during training, if given
    transformer = kwargs.get('transformer',lambda data: data)

    # evaluate the model over the grid and draw the fit in red
    fit = model(transformer(grid),w)
    plt.plot(grid.T,fit,linewidth = 2.5,color = 'r')
    plt.show()
In [7]:
# visualize the trained linear fit over the original data
plot_model(x,y,weight_history[-1],model)

Nonlinear

In [176]:
# load data
csvname = nonlinear_datapath + 'noisy_sin_sample.csv'
data = np.loadtxt(csvname,delimiter = ',')
# inputs are all columns but the last, transposed so each column is a data point
x = data[:,:-1].T
# labels are the final column, kept as a column vector
y = data[:,-1:] 

# plot dataset
plt.figure(figsize = (5,3))
plt.scatter(x,y,color = 'k',edgecolor = 'w')
plt.show()
In [180]:
def feature_transform(x,w):
    # sine feature: linear combination x^T w pushed through sin, back to row form
    feats = np.sin(np.dot(x.T,w)).T

    # prepend a row of ones so the next layer receives a bias feature
    ones_row = np.ones((1,np.shape(feats)[1]))
    return np.vstack((ones_row,feats))

# evaluate the two-layer model: sine features of the input, linearly combined.
# w is a list: w[0] = inner (feature) weights, w[1] = outer (combination) weights
def model(x,w):
    # augment the raw input with a row of ones (bias feature)
    ones_row = np.ones((1,np.shape(x)[1]))
    x_aug = np.vstack((ones_row,x))

    # pass through the sine feature transform using the inner weights
    feats = feature_transform(x_aug,w[0])

    # linearly combine the features with the outer weights and return
    return np.dot(feats.T,w[1])

# least squares cost for the nonlinear (two-layer) model; relies on globals x, y
def least_squares(w):
    residuals = model(x,w) - y
    return np.sum(residuals**2)/float(len(y))
In [178]:
# set up gradient descent to minimize the Least Squares cost for nonlinear regression
g = least_squares; max_its = 400; alpha_choice = 10**(-1);

# initialization: w_init[0] = inner feature weights, w_init[1] = outer weights
w_init = [0.1*np.random.randn(x.shape[0]+1,1), 
    0.1*np.random.randn(x.shape[0]+1,1)]

# flatten g since we have layers of weights; unflatten recovers the list form
flat_g, unflatten, w = flatten_func(g, w_init)
In [181]:
# run gradient descent on the flattened cost (w is the flattened initialization)
weight_history_1,cost_history_1 = optimizers.gradient_descent(flat_g,alpha_choice,max_its,w)
In [182]:
# plot the cost function history for a given run
static_plotter.plot_cost_histories([cost_history_1],start = 0,points = False,labels = ['run 1'])
In [183]:
# recover the layered weight list from the final flat weight vector, then plot the fit
final_weight = weight_history_1[-1]
w = unflatten(final_weight)
plot_model(x,y,w,model)
In [184]:
# create normalizer/inverse normalizer (standard normalization: zero mean, unit std)
normalizer = normalizers.standard_normalizer(x)

# make a copy of the original so it can still be plotted later
x_orig = copy.deepcopy(x)

# normalize input - rewrite variable x as normalized data
x = normalizer(x)
In [185]:
# set up gradient descent on the Least Squares cost, now over the normalized input
g = least_squares; 

# flatten g since we have layers of weights (reuse the same initialization w_init)
flat_g, unflatten, w = flatten_func(g, w_init)
In [186]:
# run gradient descent (this run uses the normalized input)
weight_history_2,cost_history_2 = optimizers.gradient_descent(flat_g,alpha_choice,max_its,w)
In [187]:
# compare cost histories: run 1 (unnormalized input) vs run 2 (normalized input)
static_plotter.plot_cost_histories([cost_history_1,cost_history_2],start = 0,points = False,labels = ['run 1','run 2'])
In [188]:
# plot the fit on the original (unnormalized) data; the transformer normalizes
# the plotting grid before evaluating the model, which was trained on normalized input
final_weight = weight_history_2[-1]
w = unflatten(final_weight)
plot_model(x_orig,y,w,model,transformer = normalizer)

fixed basis

In [222]:
# load data (same noisy sine dataset, now for the fixed polynomial basis example)
csvname = nonlinear_datapath + 'noisy_sin_sample.csv'
data = np.loadtxt(csvname,delimiter = ',')
x = data[:,:-1].T
y = data[:,-1:] 
In [223]:
def feature_transform(x,D):
    # polynomial features: powers 1 through D of the (flattened) input
    feats = np.array([x.flatten()**d for d in range(1,D+1)])

    # prepend a row of ones as the bias feature
    ones_row = np.ones((1,np.shape(feats)[1]))
    return np.vstack((ones_row,feats))
In [224]:
# evaluate the fixed-basis polynomial model (degree given by the global D)
def model(x,w):    
    # polynomial feature transformation 
    feats = feature_transform(x,D)

    # linearly combine the features with w and return
    return np.dot(feats.T,w)

# least squares cost for the fixed-basis model; relies on the globals x and y
def least_squares(w):
    residuals = model(x,w) - y
    return np.sum(residuals**2)/float(len(y))
In [225]:
# degree-3 polynomial: D+1 weights (one bias plus one per power), small random init
D = 3; w = 0.1*np.random.randn(D+1,1); g = least_squares; alpha_choice = 10**(-1); max_its = 500;
In [226]:
# run gradient descent (on the unnormalized input)
weight_history_1,cost_history_1 = optimizers.gradient_descent(g,alpha_choice,max_its,w)
In [227]:
# plot the cost function history for a given run
static_plotter.plot_cost_histories([cost_history_1],start = 0,points = False,labels = ['run 1'])
In [229]:
w = weight_history_1[-1]
# plot the fit against the data the model was actually trained on (x).
# fix: the original used x_orig, which at this point still holds a stale copy
# from the previous section — x_orig is only (re)defined in the normalization
# cell below this one
plot_model(x,y,w,model)
In [230]:
# create normalizer/inverse normalizer (standard normalization: zero mean, unit std)
normalizer = normalizers.standard_normalizer(x)

# make a copy of the original so it can still be plotted later
x_orig = copy.deepcopy(x)

# normalize input - rewrite variable x as normalized data
x = normalizer(x)
In [231]:
# load up (smaller steplength for the normalized run)
g = least_squares; alpha_choice = 10**(-2);

# run gradient descent
# note: w still holds the final weights from run 1, so this run is warm-started
weight_history_2,cost_history_2 = optimizers.gradient_descent(g,alpha_choice,max_its,w)
In [232]:
# compare cost histories: run 1 (unnormalized input) vs run 2 (normalized input)
static_plotter.plot_cost_histories([cost_history_1,cost_history_2],start = 0,points = False,labels = ['run 1','run 2'])
In [233]:
# plot on the original data; the transformer normalizes the grid before evaluation
w = weight_history_2[-1]
plot_model(x_orig,y,w,model,transformer = normalizer)

Example 6. A population growth example

In [21]:
# create instance of nonlinear regression demo, used below and in the next examples
# fix: a bare 'datapath' variable is never defined in this notebook — the yeast
# dataset lives in the nonlinear datasets folder, so use nonlinear_datapath
demo2 = nonlib.nonlinear_regression_visualizer.Visualizer(csvname = nonlinear_datapath + 'yeast.csv')

# plot dataset
demo2.plot_data(xlabel = 'time', ylabel = 'yeast population')

If we take a moment and visually examine this dataset, it appears that some sort of hyperbolic tangent (tanh) nonlinearity would fit it well. So setting our prediction function to a fully parameterized tanh function

\begin{equation} \text{predict}(x,\omega) = w_0 + w_1\text{tanh}\left(w_2 + w_3x\right) \end{equation}

where here our entire set of weights is denoted $\omega = \left \{w_0,w_1,w_2,w_3 \right \}$ and

\begin{equation} f\left(x, w_2,w_3\right) = \text{tanh}\left(w_2 + w_3x\right) \end{equation}

is our feature of the input, we can once again form and minimize the Least Squares cost function written generically as $g\left(\mathbf{w}\right) = \sum_{p = 1}^P \left(\text{predict}\left(x_p,\omega\right) - y_p\right)^2$.

With each of our notations for the feature transformation, predict function, and Least Squares cost we can once again write out everything in Python in a modular style.

In [22]:
# nonlinear feature transformation: a tanh unit with internal weights w[2], w[3]
def f(x,w):
    # shift by w[2], scale by w[3], then squash through tanh
    return np.tanh( w[2] + w[3]*x)

# prediction function: bias w[0] plus the tanh feature weighted by w[1]
def predict(x,w):
    return w[0] + w[1]*f(x,w)

# least squares cost summed point-by-point over the global dataset (x,y)
def least_squares(w):
    total = 0
    for p in range(len(y)):
        total += (predict(x[p],w) - y[p])**2
    return total

Now we can center our data and minimize the Least Squares cost function via normalized gradient descent.

In [23]:
# get and center data
demo2.center_data(); x = demo2.x; y = demo2.y;

# declare an instance of our current our optimizers
opt = nonlib.optimimzers.MyOptimizers()

# run desired algo with initial point, max number of iterations, etc.
w_best = opt.gradient_descent(g = least_squares,w = np.random.randn(4,1),max_its = 1000,steplength_rule = 'diminishing',version = 'normalized',output = 'best')

With our minimization complete we can then fit our predict function in both the original space (where it provides a good nonlinear fit) as well as in the transformed feature space where it simultaneously provides a good linear fit to the transformed data (as discussed in the previous example).

In [24]:
# static transform image
demo2.static_img(w_best,least_squares,predict,f1_x = [f(v,w_best) for v in x])

Example 7. Galileo's ramp experiment

In 1638 Galileo Galilei, infamous for his expulsion from the Catholic church for daring to claim that the earth orbited the sun and not the converse (as was the prevailing belief at the time) published his final book: Discourses and Mathematical Demonstrations Relating to Two New Sciences. In this book, written as a discourse among three men in the tradition of Aristotle, he described his experimental and philosophical evidence for the notion of uniformly accelerated physical motion. Specifically, Galileo (and others) had intuition that the acceleration of an object due to (the force we now know as) gravity is uniform in time, or in other words that the distance an object falls is directly proportional (i.e., linearly related) to the amount of time it has been traveling, squared. This relationship was empirically solidified using the following ingeniously simple experiment performed by Galileo.

Repeatedly rolling a metal ball down a grooved $\frac{1}{2}$ meter long piece of wood set at an incline as shown in the Figure below, Galileo timed how long the ball took to get $\frac{1}{4}$,$\frac{1}{2}$, $\frac{2}{3}$, $\frac{3}{4}$, and all the way down the wood ramp.

Figure 5: Figurative illustration of Galileo's ramp experiment setup used for exploring the relationship between time and the distance an object falls due to gravity. To perform this experiment he repeatedly rolled a ball down a ramp and timed how long it took to get $\frac{1}{4}$,$\frac{1}{2}$, $\frac{2}{3}$, $\frac{3}{4}$, and all the way down the ramp.

Why didn't Galileo simply drop the ball from some height and time how long it took to reach certain distances to the ground? Because no reliable way to measure time yet existed (he had to use a water clock for these experiments)! Galileo was the one who set humanity on the route towards its first reliable time-piece in his studies of the pendulum

Data from a (modern reenactment) of these experiments (averaged over 30 trials), results in the 6 data points shown below.

In [25]:
# create instance of linear regression demo, used below and in the next examples
demo3 = nonlib.nonlinear_regression_visualizer.Visualizer(csvname = datapath + 'galileo_ramp_data.csv')

# plot dataset
demo3.plot_data(xlabel = 'time (in seconds)',ylabel = 'portion of ramp traveled')

The data here certainly displays a nonlinear relationship and by viewing it - and using his physical intuition - Galileo intuited a quadratic relationship. Or in our jargon that for some $w_0$, $w_1$, and $w_2$ the prediction function

\begin{equation} \text{predict}(x,\omega) = w_0 + w_1x + w_2x^2 \end{equation}

provides the correct sort of nonlinearity to explain this data (albeit when the parameters are tuned correctly). Note here the weight set $\omega = \left \{w_0,w_1,w_2 \right \}$.

Notice here how we have 2 feature transformations: the identity $f_1(x) = x$ and the quadratic term $f_2(x) = x^2$, and so we may write the above equivalently as

\begin{equation} \text{predict}(x,\omega) = w_0 + w_1\,f_1(x) + w_2\,f_2(x) \end{equation}

which clearly shows how we are seeking out a proper linear relationship in the transformed feature space (which in this case is two-dimensional).

We express each feature transformation, the predict function, and our Least Squares cost function in Python in the following cell.

In [27]:
# feature transformations for the quadratic model
def f1(x):
    # identity feature
    return x

def f2(x):
    # quadratic feature
    return x**2
    
# prediction function: linear combination of the two fixed features
def predict(x,w):
    return w[0] + w[1]*f1(x) + w[2]*f2(x)

# least squares cost summed point-by-point over the global dataset (x,y)
def least_squares(w):
    total = 0
    for p in range(len(y)):
        total += (predict(x[p],w) - y[p])**2
    return total

And we optimize using e.g., (unnormalized) gradient descent. In this case - as is the case whenever we use feature transformations with no internal parameters to tune (which we discuss further in our series of posts on kernels / boosted trees) - the resulting cost function is convex.

In [28]:
# get and center data
demo3.center_data(); x = demo3.x; y = demo3.y;

# declare an instance of our current our optimizers
opt = nonlib.optimimzers.MyOptimizers()

# run desired algo with initial point, max number of iterations, etc.,
w_best = opt.gradient_descent(g = least_squares,alpha = 10**(-2),w = np.random.randn(3,1),max_its = 2000,version = 'unnormalized',output = 'best')

Now we can plot our original data and nonlinear fit in the original space (left panel below), as well as transformed data and simultaneous linear fit in the transformed feature space (right panel below).

In [30]:
# static transform image
f1_x = [f1(v) for v in x]; f2_x = [f2(v) for v in x]
demo3.static_img(w_best,least_squares,predict,f1_x=f1_x,f2_x=f2_x,view = [35,100])

Notice again that since we have two features in this instance our linear fit is in a space one dimension higher than the original input space defined by $x$. In other words, the transformed feature space here has two inputs: one defined by each of the two features $f_1$ and $f_2$.

This is true more generally speaking: the more feature transforms we use the higher up we go in terms of the dimensions of our transformed feature space / linear fit! In general if our original input has dimension $N$ - and is written as $\mathbf{x}$ - and we use a predict function that employs $B$ nonlinear feature transformations as

\begin{equation} \text{predict}\left(\mathbf{x},\omega\right) = w_0 + {w}_{1}\,f_1\left(\mathbf{x}\right) + {w}_{2}\,f_2\left(\mathbf{x}\right) + \cdots + w_B\,f_B\left(\mathbf{x}\right) \end{equation}

then our original space has $N$ dimensional input, while our transformed feature space is $B$ dimensional. Note here that the set of all weights $\omega$ contains not only the weights $w_1,\,w_2,...,w_B$ from the linear combination, but also any feature's internal parameters as well.

2.2 Introductory nonlinear classification examples

We first examine a number of examples of nonlinear classification, summarizing our discoveries afterwards.

Example 8. A one dimensional example

In discussing classification through the lens of logistic regression, we saw how linear classification can be thought of as a particular instance of nonlinear regression. In particular how from this perspective we aim at fitting a curve (or surface in higher dimensions) that consists of a linear combination of our input shoved through the tanh function. For $N=1$ dimensional input this regression looks like

\begin{equation} \text{tanh}\left(w_0^{\,} + {w}_{1}^{\,}{x}_p\right) \approx y_p. \end{equation}

Our predict function here is simply the linear combination

\begin{equation} \text{predict}(x,\omega) = w_0 + w_1x \end{equation}

which takes in a point $x$ and the set of weights $\omega$ for our model, which here $\omega = \left \{ w_0,w_1 \right \}$. With classification this function defines the linear decision boundary - a single point in this instance - where $\text{predict}(x,\omega) = 0$. As we discussed previously this decision boundary provides us with predicted labels for every possible input. In particular if $\text{predict}(x) > 0$ then $x$ assigned to $+1$ class, if $\text{predict}(x) < 0$ assigned to $-1$ class. This is illustrated in the Figure below. We then tune these parameters by minimizing e.g., the softmax cost.

Figure 6: A prototypical $N = 1$ dimensional input shown from the regression perspective (top panel) and from 'above' (bottom panel) where label values are illustrated as colors (red for $+1$ and blue for $-1$). A logistic regression fit providing perfect separation is shown in the top panel, along with the input line to `tanh` that defines the decision boundary for this problem (where this line pierces the input space). Predictions are then made in the future based geometrically on this line / decision boundary, with predicted regions annotated in the bottom panel.

However a linear predictor - a linear decision boundary - is quite inflexible in general and fails to provide good separation even in the simple example below. Here we clearly need a predict function that can cross the input space (the x axis) twice at points separated by a large distance - something a line could never do. This dataset is shown from the 'regression' perspective in the top panel - with output plotted explicitly - and 'from above' in the bottom panel with output denoted by color (with the usual scheme).

In [29]:
# create instance of linear regression demo, used below and in the next examples
demo4 = nonlib.nonlinear_classification_visualizer.Visualizer(csvname = datapath + 'signed_projectile.csv')

# plot dataset
demo4.plot_data()

What sort of simple function crosses the horizontal axis twice? How about a quadratic function? If adjusted to the right height a quadratic certainly can be made to cross the horizontal axis twice and - when shoved through a tanh - could indeed give us the sort of predictions we desire. This idea is drawn figuratively for a dataset like the one above in the Figure below.

Figure 7: An example dataset like the one we are currently looking to learn a classifier on (shown from the regression perspective in the top panel, and 'from above' in the bottom panel). Here we show what a quadratic predictor could achieve in terms of providing a proper decision boundary consisting of two distinct and separate points, with the proper regions of the space classified accordingly.

Using a generic quadratic function as our predictor takes the form

\begin{equation} \text{predict}(x,\omega) = w_0 + w_1x^{\,} + w_2x^2 \end{equation}

where here the set of weights $\omega = \left\{w_0,w_1,w_2\right\}$. Here we have two feature transformations (we will write explicitly in code)

\begin{equation} \text{feature transformations}: f_1(x) = x \,\,\,\,\,\, f_2(x) = x^2 \end{equation}

so we can write our predictor equivalently as

\begin{equation} \text{predict}(x,\omega) = w_0 + w_1\,f_1(x) + w_2\,f_2(x). \end{equation}

Notice here that - as with regression - while we think of this predictor as providing a nonlinear separation in the original space, it is indeed linear in the features themselves. Hence - as we will see - it will simultaneously provide a linear fit in the transformed feature space - that is the space whose input axes are defined via these features.

We then look to tune these weights by minimizing the softmax cost function (although we could use any other cost like e.g., the perceptron) as $\,g\left(\mathbf{w}\right) = \sum_{p=1}^{P} \text{log}\left(1 + e^{-y_p\, \text{predict}_{}\left(x_p\right)} \right)$. We write out each component of this functionality in Python in the next cell.

In [31]:
# feature transforms for the quadratic classifier
def f1(x):
    # identity feature
    return x

def f2(x):
    # quadratic feature
    return x**2
    
# prediction function: linear combination of the two fixed features
def predict(x,w):
    return w[0] + w[1]*f1(x) + w[2]*f2(x)

# softmax (log-loss) cost summed point-by-point over the global dataset (x,y)
def softmax(w):
    total = 0
    for p in range(len(y)):
        total += np.log(1 + np.exp(-y[p]*predict(x[p],w)))
    return total

With our functionality defined we can then minimize the softmax cost using normalized gradient descent, as we do below.

In [32]:
# get data 
x = demo4.x; y = demo4.y;    

# declare an instance of our current our optimizers
opt = nonlib.optimimzers.MyOptimizers()

# run desired algo with initial point, max number of iterations, etc.,
w_best = opt.gradient_descent(g = softmax,w = np.random.randn(4,1),max_its = 2000,alpha = 10**(-1),version = 'normalized',output = 'best')

With our weights tuned and our predictor trained we can then plot the resulting fit / separation. In the left panel we show the original dataset - from the regression perspective - along with the nonlinear fit provided by our nonlinear logistic regressor $\text{tanh}\left(\text{predict}\left(x\right)\right) = y$. In the right panel we show the same dataset only in the transformed feature space defined by our two features. Here a datapoint that originally had input $x_p$ now has input $\left(f_1\left(x_p\right),\,f_2\left(x_p\right)\right)$. In this space the separation / decision boundary is linear.

In [35]:
# static transform image
demo4.static_N1_img(w_best,least_squares,predict,f1_x = [f1(s) for s in x], f2_x = [f2(s) for s in x],view = [25,15])

What we see with this example - a nonlinear decision boundary in the original space being simultaneously linear in the transformed feature space - always happens in practice if we have chosen our features well (i.e., as to provide a good nonlinear decision boundary in the original space).

Properly designed features provide good nonlinear separation in the original feature space and, simultaneously, good linear separation in the transformed feature space.

Also notice here that since we have used two features our feature space is one dimension larger than the original space. As was the case with regression this is true more generally speaking: the more feature transforms we use the higher up we go in terms of the dimensions of our transformed feature space / linear separation! In general if our original input has dimension $N$ - and is written as $\mathbf{x}$ - and we use a predict function that employs $B$ nonlinear feature transformations as

\begin{equation} \text{predict}\left(\mathbf{x},\omega\right) = w_0 + {w}_{1}\,f_1\left(\mathbf{x}\right) + {w}_{2}\,f_2\left(\mathbf{x}\right) + \cdots + w_B\,f_B\left(\mathbf{x}\right) \end{equation}

then our original space has $N$ dimensional input, while our transformed feature space is $B$ dimensional. Note here that the set of all weights $\omega$ contains not only the weights $w_1,\,w_2,...,w_B$ from the linear combination, but also any feature's internal parameters as well.

Example 9. A two-dimensional example

Let us examine the following $N=2$ input dataset below, visualized the regression perspective (left panel) and 'from above' (right panel).

In [33]:
# create instance of linear regression demo, used below and in the next examples
demo5 = nonlib.nonlinear_classification_visualizer.Visualizer(datapath + 'ellipse_2class_data.csv')

# an implementation of the least squares cost function for linear regression for N = 2 input dimension datasets
demo5.plot_data()

Visually examining the dataset it appears that some sort of elliptical decision boundary centered at the origin might do a fine job of classification. Thus we set our predictor function to the general parameterized form of such an ellipse, giving

\begin{equation} \text{predict}(x_1,x_2,\omega) = w_0^{\,} + w_1^{\,} x_1^2 + w_2^{\,}x_2^2. \end{equation}

Parsing this formula, we can see that we have used two feature transformations $f_1(x_1,x_2)=x_1^2$ and $f_2(x_1,x_2) = x_2^2$. Expressing our predict function in terms of this notation we can see that our transformed feature space will have the same number of dimensions as our original space (i.e., two).

\begin{equation} \text{predict}(x_1,x_2,\omega) = w_0 + w_1\,f_1\,\left(x_1,x_2\right) + w_2\,f_2\,\left(x_1,x_2\right). \end{equation}

We write out each feature transformation, the prediction function, and softmax cost in Python below.

In [34]:
# feature transformations: square each input coordinate
def f1(x):
    # squared first coordinate
    return (x[0])**2

def f2(x):
    # squared second coordinate
    return (x[1])**2
    
# prediction function: elliptical boundary as a linear combination of the features
def predict(x,w):
    return w[0] + w[1]*f1(x) + w[2]*f2(x)

# softmax cost over the global N = 2 dataset (x stores one point per row)
def softmax(w):
    total = 0
    for p in range(len(y)):
        total += np.log(1 + np.exp(-y[p]*predict(x[p,:],w)))
    return total

And now we minimize the softmax above via gradient descent.

In [36]:
# get data
x = demo5.x; y = demo5.y;   

# declare an instance of our current our optimizers
opt = nonlib.optimimzers.MyOptimizers()

# run desired algo with initial point, max number of iterations, etc.,
w_best = opt.gradient_descent(g = softmax,w = np.random.randn(3,1),max_its = 1000,alpha = 10**(-1),version = 'normalized',output = 'best')

With our weights tuned we can now plot the data in its original space (left panels) - along with the nonlinear decision boundary provided by the trained predictor - and in the transformed feature space (right panels) - where the corresponding decision boundary is linear. In each panel we color a region of space by what class our predictor assigns it post training. Indeed our presumption of an elliptical boundary seems like a good one here - as our classification results are quite good.

In [37]:
# illustrate results
demo5.static_N2_img(w_best,softmax,predict,f1,f2,view1 = [20,45],view2 = [20,30])

2.3 General conclusions

We have seen that when we can identify a candidate nonlinearity, we can quickly swap out linearity for nonlinearity in both our regression and classification paradigms. When doing this we often refer to each nonlinear function used as a 'feature' or 'feature transformation'. In particular, with regression we found that

A properly designed feature (or set of features) provides a good nonlinear fit in the original feature space and, simultaneously, a good linear fit in the transformed feature space.

Likewise for classification we found that

Properly designed features provide good nonlinear separation in the original feature space and, simultaneously, good linear separation in the transformed feature space.